I used the Kraken2 results with MPA-style based on silva 16S dataset 138.1. Here, I only keep bacteria at family level & CLR transformed (Kraken2 Barbell).
rm(list=ls())
# library(ape)
library(tidyverse)
library(vegan)
library(ggpubr)
library(phyloseq)
library(circlize)
library(ggpmisc) # For regression equation annotation
library(broom) # For extracting p-values
library(RColorBrewer)
mytheme <- theme_light()+
theme(text = element_text(size = 10),
legend.title = element_text(face = "bold", size = 10),
legend.text = element_text(size = 10),
legend.justification = c("right", "top"),
legend.background = element_rect(),
strip.background = element_blank(),
strip.text = element_text(face = "bold", size = 10, color = "black"),
axis.ticks = element_line(colour = "grey70", linewidth = 0.2),
panel.grid.major = element_line(colour = "grey70", linewidth = 0.1),
panel.grid.minor = element_blank())
# Path to the folder containing Kraken2 report files
folder_path <- "div_analysis_family_level/"
# Create the directory if it does not already exist
if (!dir.exists(folder_path)) {
dir.create(folder_path)
}
Use relative abundance table for the analysis.
# read phyloseq object from all the samples - raw reads
ps_rawreads <- readRDS("phyloseq_object_family_rawreads.rds")
otu_table_all <- as.data.frame(otu_table(ps_rawreads))
otu_table <- otu_table_all %>%
select(!grep("Mock", names(otu_table_all)) & !grep("control", names(otu_table_all)) & !grep("FieldBlank", names(otu_table_all)) & !grep("f0_b0", names(otu_table_all))) %>%
select(!c(IGAPark, MittelmoleWarnemunde, NeptunWerft, SABMarinaBramow, Stadthafen)) %>%
rename(f100_b0_f_p0_r1 = Muhlendamm ,
f0_b100_b_p0_r1 = WarnemundeStrand)
otu_table[1:2, 1:2]
## f0_b100_b_p1_r1
## d__Bacteria|p__Abditibacteriota|c__Abditibacteria|o__Abditibacteriales|f__Abditibacteriaceae 0
## d__Bacteria|p__Acidobacteriota|c__Acidobacteriae|o__Acidobacteriales|f__Acidobacteriaceae (Subgroup 1) 0
## f0_b100_b_p1_r2
## d__Bacteria|p__Abditibacteriota|c__Abditibacteria|o__Abditibacteriales|f__Abditibacteriaceae 0
## d__Bacteria|p__Acidobacteriota|c__Acidobacteriae|o__Acidobacteriales|f__Acidobacteriaceae (Subgroup 1) 0
# number of families x number of samples
dim(otu_table)
## [1] 357 242
# summary of 240 samples (240 microcosm samples and two field samples)
summary(colSums(otu_table))
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 3591 11962 19580 22195 30152 83009
# total number of reads from 240 samples
sum(colSums(otu_table))
## [1] 5371097
otu_table_rel <- otu_table %>%
mutate(across(everything(), ~ .x / sum(.x)))
otu_table_rel <- t(otu_table_rel)
otu_table_rel[120:121, c(16, 22)]
## d__Bacteria|p__Actinobacteriota|c__Actinobacteria|o__Frankiales|f__Sporichthyaceae
## f25_b75_b_p6_r4 7.753140e-05
## f25_b75_f_p1_r1 2.321371e-05
## d__Bacteria|p__Actinobacteriota|c__Actinobacteria|o__Micrococcales|f__Microbacteriaceae
## f25_b75_b_p6_r4 0.000000e+00
## f25_b75_f_p1_r1 2.321371e-05
dim(otu_table_rel)
## [1] 242 357
# get diversity indexes
Shannon <- diversity(otu_table_rel, "shannon")
head(Shannon)
## f0_b100_b_p1_r1 f0_b100_b_p1_r2 f0_b100_b_p1_r3 f0_b100_b_p1_r4 f0_b100_b_p2_r1
## 1.779974 1.722050 1.514423 1.708745 1.465360
## f0_b100_b_p2_r2
## 1.307836
# Simpson <- diversity(t(otu_table_rel), "simpson")
# dim(Simpson)
Richness <- specnumber(otu_table_rel)
head(Richness)
## f0_b100_b_p1_r1 f0_b100_b_p1_r2 f0_b100_b_p1_r3 f0_b100_b_p1_r4 f0_b100_b_p2_r1
## 104 96 98 96 75
## f0_b100_b_p2_r2
## 70
# Pielou's J = H'/ln(S) where H' is Shannon Weiner diversity and S is the total number of species in a sample,
Pielous_evenness <- Shannon/log(Richness)
df <- as.data.frame(cbind(Richness, Shannon)) %>%
rownames_to_column() %>%
separate(rowname, c("fresh_bac", "sea_bac", "watertype", "passage", "replicate"), "_") %>%
select(-replicate) %>%
mutate(watertype = factor(watertype, levels = c("f", "b"), labels = c("Freshwater", "Seawater"))) %>%
unite(mix, c(fresh_bac, sea_bac), sep = "_", remove = TRUE) %>%
mutate(mix = factor(mix, levels = c("f0_b100", "f25_b75", "f50_b50", "f75_b25", "f100_b0"), labels = c("0:100", "25:75", "50:50", "75:25", "100:0"))) %>%
mutate(passage = factor(as.numeric(str_remove(passage, "^p"))))
head(df)
## mix watertype passage Richness Shannon
## 1 0:100 Seawater 1 104 1.779974
## 2 0:100 Seawater 1 96 1.722050
## 3 0:100 Seawater 1 98 1.514423
## 4 0:100 Seawater 1 96 1.708745
## 5 0:100 Seawater 2 75 1.465360
## 6 0:100 Seawater 2 70 1.307836
# write.csv(df, paste0(folder_path, "richness_shannon_all_samples.csv"))
df1 <- df %>%
pivot_longer(-c(mix, watertype, passage), names_to = "div_index") %>%
group_by(mix, watertype, passage, div_index) %>%
summarize(mean = mean(value, na.rm=TRUE), sd = round(sd(value), 3), n = round(length(value), 3))
## `summarise()` has grouped output by 'mix', 'watertype', 'passage'. You can
## override using the `.groups` argument.
df1$se <- df1$sd / sqrt(df1$n) # Calculate standard error of the mean
head(df1)
## # A tibble: 6 × 8
## # Groups: mix, watertype, passage [3]
## mix watertype passage div_index mean sd n se
## <fct> <fct> <fct> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 0:100 Freshwater 1 Richness 55.2 9.95 4 4.97
## 2 0:100 Freshwater 1 Shannon 1.00 0.292 4 0.146
## 3 0:100 Freshwater 2 Richness 64.8 17.7 4 8.84
## 4 0:100 Freshwater 2 Shannon 1.24 0.214 4 0.107
## 5 0:100 Freshwater 3 Richness 45 15.3 4 7.65
## 6 0:100 Freshwater 3 Shannon 1.46 0.143 4 0.0715
(p <- ggplot(df1, aes(passage, mean, group =mix)) +
geom_errorbar(aes(ymin = mean-se, ymax = mean+se, color = mix),
width = 0.2, position = position_dodge(width = 0.4), alpha = 0.6) +
geom_line(aes(color = mix), position = position_dodge(width = 0.4), alpha = 0.6) +
geom_point(aes(fill = mix), size = 2, stroke = 0.1, position = position_dodge(width = 0.4), shape = 21, color = "black") +
labs(x = "Passages", y = "", title = "") +
facet_grid(div_index ~ watertype, scales = "free_y") +
scale_fill_brewer(palette = "RdYlBu", guide=guide_legend(override.aes = list(shape=21)), name = "Mixing ratio \n(fresh:sea)")+
scale_color_brewer(palette = "RdYlBu", name = "Mixing ratio \n(fresh:sea)")+
mytheme)
# ggsave(paste0(folder_path, "richness_shannon_lineplot_family.png"), width = 5.5, height = 3.5, p)
Here we did PCA analysis based on clr transformed table. > “Aitchison distance has numerous other desirable properties, such as scale invariance, negating the need to perform rarefaction. This feature is critical when one lacks access to absolute microbial abundance, because scale variant distances ensure equivalence between distances computed from absolute and relative abundance measurements.”
# read phyloseq object from all the samples - CLR transformed
ps_clr <- readRDS("phyloseq_object_family_clr.rds")
ps_clr
## phyloseq-class experiment-level object
## otu_table() OTU Table: [ 357 taxa and 272 samples ]
## sample_data() Sample Data: [ 272 samples by 8 sample variables ]
## tax_table() Taxonomy Table: [ 357 taxa by 5 taxonomic ranks ]
# only keep the samples from microcosm experiment
ps_microcosm <- subset_samples(ps_clr, catagery == "microcosm")
# remove taxa do not appear in any samples
ps_microcosm <- filter_taxa(ps_microcosm, function(x) sum(x) != 0, TRUE)
# check
ps_microcosm
## phyloseq-class experiment-level object
## otu_table() OTU Table: [ 343 taxa and 240 samples ]
## sample_data() Sample Data: [ 240 samples by 8 sample variables ]
## tax_table() Taxonomy Table: [ 343 taxa by 5 taxonomic ranks ]
tail(sample_data(ps_microcosm))
## SampleNO catagery sub_catagery freshwaterCom brackishwaterCom
## f50_b50_f_p4_r4 166 microcosm microcosm f50 b50
## f75_b25_f_p4_r3 169 microcosm microcosm f75 b25
## f0_b100_f_p3_r2 114 microcosm microcosm f0 b100
## f0_b100_f_p3_r4 116 microcosm microcosm f0 b100
## f75_b25_b_p1_r4 24 microcosm microcosm f75 b25
## f0_b100_f_p4_r4 158 microcosm microcosm f0 b100
## watertype passage replicate
## f50_b50_f_p4_r4 f p4 r4
## f75_b25_f_p4_r3 f p4 r3
## f0_b100_f_p3_r2 f p3 r2
## f0_b100_f_p3_r4 f p3 r4
## f75_b25_b_p1_r4 b p1 r4
## f0_b100_f_p4_r4 f p4 r4
# prepare feature table for further analysis
com <- as.data.frame(otu_table(t(ps_microcosm)))
# make col.names shorter
colnames(com) <- sub(".*(f__)", "\\", colnames(com))
com[1:3, 1:3]
## Abditibacteriaceae Acidobacteriaceae (Subgroup 1) uncultured
## f25_b75_f_p2_r1 -1.9941169 0 0
## f25_b75_f_p3_r1 0.3209210 0 0
## f25_b75_f_p4_r1 -0.5822332 0 0
###########################
# PCA
###########################
# Aitchison distance is simply the Euclidean distance between samples after clr transformation,
pca_result <- rda(com)
sum_pca <- summary(pca_result)
sum_pca$cont$importance[2, 1]
## [1] 0.2304891
sum_pca$cont$importance[2, 2]
## [1] 0.09965272
biplot(pca_result, scaling = 2) # Scaling options: 1 (focus on variables), 2 (focus on sites)
# Extract species scores (variable contributions) from PCA
species_scores <- as.data.frame(scores(pca_result, display = "species"))
# Set the threshold for a feature/family thet longer than certain values at PC1 or PC2
threshold <- 1
# Filter species based on PC1 or PC2 exceeding the threshold
filtered_species <- species_scores[abs(species_scores$PC1) > threshold | abs(species_scores$PC2) > threshold, ]
head(filtered_species)
## PC1 PC2
## Cyclobacteriaceae -1.3203459 0.0008841836
## Spirosomaceae 1.7101496 0.9450120874
## Sphingobacteriaceae 1.3160709 0.3364114546
## Bacillaceae 0.2737779 -1.1774890817
## Planococcaceae 0.3809445 -1.2688189384
## Aeromonadaceae -0.6909358 2.0807396685
row.names(filtered_species)
## [1] "Cyclobacteriaceae" "Spirosomaceae" "Sphingobacteriaceae"
## [4] "Bacillaceae" "Planococcaceae" "Aeromonadaceae"
## [7] "Alteromonadaceae" "Colwelliaceae" "Pseudoalteromonadaceae"
## [10] "Shewanellaceae" "Comamonadaceae" "Oxalobacteraceae"
## [13] "Enterobacteriaceae" "Yersiniaceae" "Marinomonadaceae"
## [16] "Vibrionaceae"
# Site scores (coordinates of samples in PCA space)
df <- as.data.frame(scores(pca_result, display = "sites"))
# Add metadata
df <- df %>%
rownames_to_column() %>%
separate(rowname, c("fresh_bac", "sea_bac", "watertype", "passage", "replicate"), "_", remove = FALSE) %>%
# select(-replicate) %>%
mutate(watertype = factor(watertype, levels = c("f", "b"), labels = c("Freshwater", "Seawater"))) %>%
unite(mix, c(fresh_bac, sea_bac), sep = "_", remove = TRUE) %>%
mutate(mix = factor(mix, levels = c("f0_b100", "f25_b75", "f50_b50", "f75_b25", "f100_b0"), labels = c("0:100", "25:75", "50:50", "75:25", "100:0"))) %>%
mutate(passage = factor(as.numeric(str_remove(passage, "^p"))))
# mutate(passage = factor(passage, levels = c("p0", "p1", "p2", "p3", "p4", "p5", "p6")))
head(df)
## rowname mix watertype passage replicate PC1 PC2
## 1 f25_b75_f_p2_r1 25:75 Freshwater 2 r1 0.4462170 1.91802547
## 2 f25_b75_f_p3_r1 25:75 Freshwater 3 r1 1.1792167 0.65538701
## 3 f25_b75_f_p4_r1 25:75 Freshwater 4 r1 0.9235986 -0.08924358
## 4 f25_b75_f_p5_r1 25:75 Freshwater 5 r1 0.6120449 0.96547827
## 5 f25_b75_f_p6_r1 25:75 Freshwater 6 r1 0.9405888 0.08467159
## 6 f50_b50_f_p6_r1 50:50 Freshwater 6 r1 0.7155475 0.86077381
# only color by water
(p1 <- ggplot(df, aes(PC1, PC2)) +
geom_point(aes(fill = watertype, shape = watertype), size = 2.5, stroke = 0.1) +
labs(x = paste("PCA1 (", round(sum_pca$cont$importance[2, 1] * 100, 2), "%)", sep = ""),
y = paste("PCA2 (", round(sum_pca$cont$importance[2, 2] * 100, 2), "%)", sep = ""),
title = "") +
scale_fill_manual(values = c("#4575B4", "#D73027"), guide=guide_legend(override.aes = list(shape=21)), name = "Culture water") +
scale_shape_manual(values = c(22, 21), name = "Culture water") +
mytheme)
# facet by mixing ratios
(p2 <- ggplot(df, aes(PC1, PC2, shape = watertype, fill = mix)) +
geom_point(size = 2.5, alpha = 0.6, stroke = 0.1) +
facet_grid(watertype ~ passage) +
labs(x = paste("PCA1 (", round(sum_pca$cont$importance[2, 1] * 100, 2), "%)", sep = ""),
y = paste("PCA2 (", round(sum_pca$cont$importance[2, 2] * 100, 2), "%)", sep = ""),
title = "") +
scale_fill_brewer(palette = "RdYlBu", guide=guide_legend(override.aes = list(shape=21)), name = "Mixing ratio\n(fresh:sea)")+
scale_shape_manual(values = c(22, 21), name = "Culture water") +
mytheme)
# facet by passage
(p3 <- ggplot(df, aes(PC1, PC2, shape = watertype, fill = passage)) +
geom_point(size = 2.5, alpha = 0.6, stroke = 0.1) +
facet_grid(watertype ~ mix) +
labs(x = paste("PCA1 (", round(sum_pca$cont$importance[2, 1] * 100, 2), "%)", sep = ""),
y = paste("PCA2 (", round(sum_pca$cont$importance[2, 2] * 100, 2), "%)", sep = ""),
title = "") +
scale_fill_brewer(palette = "YlGn", guide=guide_legend(override.aes = list(shape=21)), name = "Passages")+
scale_shape_manual(values = c(22, 21), guide = "none") +
mytheme)
# (p23 <- ggarrange(p2, p3, labels = c("A", "B"), font.label = list(size = 10), widths = c(1, 0.7), nrow = 2, ncol = 1))
# ggsave(paste0(folder_path, "PCA_all_combined_facet_clr_no_p0_legend.pdf"), width = 8, height = 6, p23)
# PCA plot with family names when their length at PC1 or PC2 > threshold
(p4 <- ggplot(df, aes(PC1, PC2)) +
geom_point(aes(fill = mix, shape = watertype), size = 3.5, alpha = 0.6, stroke = 0.1) +
geom_text(aes(label = passage), size = 1.2, alpha = 0.4, color = "black") +
geom_segment(data = filtered_species, aes(x = 0, y = 0, xend = PC1, yend = PC2), arrow = arrow(length = unit(0.2, "cm")), linewidth = 0.1, color = "black", alpha = 0.6) +
geom_text(data = filtered_species, aes(x = PC1*0.95, y = PC2*0.95, label = rownames(filtered_species)), color = "black", alpha = 0.8, size = 3, vjust = -0.5) +
labs(x = paste("PCA1 (", round(sum_pca$cont$importance[2, 1] * 100, 2), "%)", sep = ""), y = paste("PCA2 (", round(sum_pca$cont$importance[2, 2] * 100, 2), "%)", sep = ""), title = "") +
scale_fill_brewer(palette = "RdYlBu", guide=guide_legend(override.aes = list(shape=21)), name = "Mixing ratio\n(fresh:sea)")+
scale_shape_manual(values = c(22, 21), name = "Culture water") +
mytheme)
# ggsave(paste0(folder_path,"PCA_water_family_kraken_barbell_clr_water_mix_no_p0_raw.pdf"), width = 5, height = 3.5, p4)
# make plot without arrows
summary(df)
## rowname mix watertype passage replicate
## Length:240 0:100:48 Freshwater:120 1:40 Length:240
## Class :character 25:75:48 Seawater :120 2:40 Class :character
## Mode :character 50:50:48 3:40 Mode :character
## 75:25:48 4:40
## 100:0:48 5:40
## 6:40
## PC1 PC2
## Min. :-1.4623 Min. :-2.29297
## 1st Qu.:-0.9055 1st Qu.:-0.68410
## Median : 0.2004 Median : 0.09374
## Mean : 0.0000 Mean : 0.00000
## 3rd Qu.: 0.8873 3rd Qu.: 0.61582
## Max. : 1.4795 Max. : 1.91803
# make the bundary
min_x = min(df$PC1) + 0.2
max_x = max(df$PC1) + 0.2
min_y = min(df$PC2) + 0.2
max_y = max(df$PC2) + 0.2
colors <- brewer.pal(n = 11, name = "RdYlBu")
print(colors)
## [1] "#A50026" "#D73027" "#F46D43" "#FDAE61" "#FEE090" "#FFFFBF" "#E0F3F8"
## [8] "#ABD9E9" "#74ADD1" "#4575B4" "#313695"
(p5 <- ggplot(df, aes(PC1, PC2)) +
geom_point(aes(fill = mix, shape = watertype), size = 3, alpha = 0.6, stroke = 0.1) +
geom_text(aes(label = passage), size = 2, alpha = 0.4, color = "black") +
labs(x = paste("PCA1 (", round(sum_pca$cont$importance[2, 1] * 100, 2), "%)", sep = ""),
y = paste("PCA2 (", round(sum_pca$cont$importance[2, 2] * 100, 2), "%)", sep = ""),
title = "") +
# scale_x_continuous(limits = c(min_x, max_x)) +
# scale_y_continuous(limits = c(min_y, max_y)) +
scale_fill_brewer(palette = "RdYlBu", guide=guide_legend(override.aes = list(shape=21)), name = "Mixing ratio\n(fresh:sea)") +
scale_shape_manual(values = c(22, 21), name = "Culture water") +
mytheme)
# ggsave(paste0(folder_path, "PCA_water_family_kraken_barbell_clr_water_mix_no_p0_all.png"), width = 4.5, height = 3, p5)
###########################
# permanova
###########################
# Permutational Multivariate Analysis of Variance Using Distance Matrices
dist <- vegdist(com, method = "euclidean")
df <- as.data.frame(com[,1:2]) %>%
rownames_to_column() %>%
select(rowname) %>%
separate(rowname, c("fresh_bac", "sea_bac", "watertype", "passage", "replicate"), "_", remove = FALSE) %>%
column_to_rownames(var = "rowname") %>%
mutate(watertype = factor(watertype, levels = c("f", "b"), labels = c("freshwater", "seawater"))) %>%
unite(mix, c(fresh_bac, sea_bac), sep = "_", remove = TRUE) %>%
mutate(mix = factor(mix, levels = c("f0_b100", "f25_b75", "f50_b50", "f75_b25", "f100_b0"))) %>%
mutate(passage = factor(passage, levels = c("p0", "p1", "p2", "p3", "p4", "p5", "p6")))
head(df)
## mix watertype passage replicate
## f25_b75_f_p2_r1 f25_b75 freshwater p2 r1
## f25_b75_f_p3_r1 f25_b75 freshwater p3 r1
## f25_b75_f_p4_r1 f25_b75 freshwater p4 r1
## f25_b75_f_p5_r1 f25_b75 freshwater p5 r1
## f25_b75_f_p6_r1 f25_b75 freshwater p6 r1
## f50_b50_f_p6_r1 f50_b50 freshwater p6 r1
all(rownames(as.matrix(dist)) == rownames(df))
## [1] TRUE
# # three-way permanova
set.seed(123)
# sequential tests
result_whole <- adonis2(dist ~ mix*watertype*passage, data = df, permutation = 999, by = "terms")
#result_whole <- adonis2(com ~ mix*watertype*passage, data = df, method = "euclidean", permutation = 999)
result_whole
## Permutation test for adonis under reduced model
## Terms added sequentially (first to last)
## Permutation: free
## Number of permutations: 999
##
## adonis2(formula = dist ~ mix * watertype * passage, data = df, permutations = 999, by = "terms")
## Df SumOfSqs R2 F Pr(>F)
## mix 4 3583 0.08261 7.9334 0.001 ***
## watertype 1 8633 0.19902 76.4549 0.001 ***
## passage 5 2141 0.04936 3.7928 0.001 ***
## mix:watertype 4 2803 0.06462 6.2059 0.001 ***
## mix:passage 20 2094 0.04828 0.9274 0.792
## watertype:passage 5 1465 0.03377 2.5944 0.001 ***
## mix:watertype:passage 20 2333 0.05378 1.0331 0.327
## Residual 180 20325 0.46856
## Total 239 43378 1.00000
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# write.csv(result_whole, paste0(folder_path, "permanova_water_mix_passage_family_Aitchison.csv"))
To assess the influence of each source community on the coalesced community.
# pairwise dissimilarity
dist <- vegdist(com, method = "euclidean", binary = FALSE, diag = 1)
# Convert distance to dissimilarity in the range [0, 1]
# dissimilarity <- dist / max(dist)
# df <- as.matrix(dissimilarity)
df <- as.matrix(dist)
df <- data.frame(as.table(df))[lower.tri(df, diag = FALSE), ]
cat("For no. of df pairs: Observed = Predict", (240*240-240)/2 == length(df$Freq), "\n")
## For no. of df pairs: Observed = Predict TRUE
row.names(df) <- paste(df$Var1, df$Var2, sep = "_")
summary(df)
## Var1 Var2 Freq
## f0_b100_f_p4_r4: 239 f25_b75_f_p2_r1: 239 Min. : 7.933
## f75_b25_b_p1_r4: 238 f25_b75_f_p3_r1: 238 1st Qu.:16.587
## f0_b100_f_p3_r4: 237 f25_b75_f_p4_r1: 237 Median :19.041
## f0_b100_f_p3_r2: 236 f25_b75_f_p5_r1: 236 Mean :18.788
## f75_b25_f_p4_r3: 235 f25_b75_f_p6_r1: 235 3rd Qu.:21.061
## f50_b50_f_p4_r4: 234 f50_b50_f_p6_r1: 234 Max. :28.842
## (Other) :27261 (Other) :27261
# define source communities, i.e. f100-f & b100-b
df_source <- df[grepl("f100_b0_f", df$Var1) | grepl("f100_b0_f", df$Var2) | grepl("f0_b100_b", df$Var1) | grepl("f0_b100_b", df$Var2), ]
df_source <- df_source %>%
rownames_to_column(var = "name") %>%
separate(name, c("fresh_a", "sea_a", "water_a", "passage_a", "replicate_a", "fresh_b", "sea_b", "water_b", "passage_b", "replicate_b"), "_", remove = FALSE) %>%
filter(passage_a == passage_b) %>%
select(-c(replicate_a, replicate_b, passage_b)) %>%
rename(passage = passage_a) %>%
mutate(passage = factor(gsub("p", "", passage))) %>%
unite(a, c(fresh_a, sea_a), sep = "_", remove = TRUE) %>%
unite(b, c(fresh_b, sea_b), sep = "_", remove = TRUE) %>%
select(-c(Var1, Var2)) %>%
column_to_rownames(var = "name")
dim(df_source)
## [1] 1704 6
tail(df_source)
## a water_a passage b water_b
## f25_b75_b_p5_r2_f0_b100_b_p5_r1 f25_b75 b 5 f0_b100 b
## f25_b75_b_p5_r4_f0_b100_b_p5_r1 f25_b75 b 5 f0_b100 b
## f50_b50_b_p5_r3_f0_b100_b_p5_r1 f50_b50 b 5 f0_b100 b
## f50_b50_f_p5_r2_f0_b100_b_p5_r1 f50_b50 f 5 f0_b100 b
## f50_b50_f_p5_r4_f0_b100_b_p5_r1 f50_b50 f 5 f0_b100 b
## f75_b25_b_p5_r3_f0_b100_b_p5_r1 f75_b25 b 5 f0_b100 b
## Freq
## f25_b75_b_p5_r2_f0_b100_b_p5_r1 13.51981
## f25_b75_b_p5_r4_f0_b100_b_p5_r1 16.69061
## f50_b50_b_p5_r3_f0_b100_b_p5_r1 12.05826
## f50_b50_f_p5_r2_f0_b100_b_p5_r1 16.47511
## f50_b50_f_p5_r4_f0_b100_b_p5_r1 19.06562
## f75_b25_b_p5_r3_f0_b100_b_p5_r1 16.26554
# in freshwater media
f_levels <- c("f100_b0_f", "f75_b25_f", "f50_b50_f", "f25_b75_f", "f0_b100_f", "f0_b100_b")
df_f <- df_source %>%
unite(mix_a, c(a, water_a), sep = "_") %>%
unite(mix_b, c(b, water_b), sep = "_") %>%
mutate(mix_a = factor(mix_a)) %>%
mutate(mix_b = factor(mix_b)) %>%
filter(mix_a %in% f_levels & mix_b %in% f_levels) %>%
unite(mix, c(mix_a, mix_b), sep = "_") %>%
mutate(mix = factor(mix)) %>%
filter(mix != "f0_b100_b_f0_b100_b") %>%
droplevels()
df_f$watertype <- "Freshwater"
levels(df_f$mix)
## [1] "f0_b100_b_f0_b100_f" "f0_b100_b_f100_b0_f" "f0_b100_b_f25_b75_f"
## [4] "f0_b100_b_f50_b50_f" "f0_b100_b_f75_b25_f" "f0_b100_f_f0_b100_b"
## [7] "f0_b100_f_f100_b0_f" "f100_b0_f_f0_b100_b" "f100_b0_f_f0_b100_f"
## [10] "f100_b0_f_f100_b0_f" "f100_b0_f_f25_b75_f" "f100_b0_f_f50_b50_f"
## [13] "f100_b0_f_f75_b25_f" "f25_b75_f_f0_b100_b" "f25_b75_f_f100_b0_f"
## [16] "f50_b50_f_f0_b100_b" "f50_b50_f_f100_b0_f" "f75_b25_f_f0_b100_b"
## [19] "f75_b25_f_f100_b0_f"
# Add source_com and mix_ratio columns
df_f <- df_f %>%
mutate(
source_com = ifelse(grepl("f0_b100_b", mix), "sea", "fresh"),
mix_ratio = case_when(
grepl("f0_b100_f", mix) ~ "0:100",
grepl("f25_b75", mix) ~ "25:75",
grepl("f50_b50", mix) ~ "50:50",
grepl("f75_b25", mix) ~ "75:25",
TRUE ~ "100:0" # Default value
)
) %>%
mutate(mix_simple = paste(mix_ratio, "vs.", source_com))
f_factor_levels <- c("100:0 vs. fresh",
"75:25 vs. fresh",
"50:50 vs. fresh",
"25:75 vs. fresh",
"0:100 vs. fresh",
"100:0 vs. sea",
"75:25 vs. sea",
"50:50 vs. sea",
"25:75 vs. sea",
"0:100 vs. sea")
# I should have 4*4*6 pairs for each coalesced com to a source comunity, for the source community itsefl, which should have 6*6 pairs
df_f <- df_f %>%
mutate(mix_simple = factor(mix_simple, levels = f_factor_levels))
# check results
head(df_f)
## mix passage Freq watertype
## f0_b100_b_p2_r2_f25_b75_f_p2_r1 f0_b100_b_f25_b75_f 2 21.07002 Freshwater
## f0_b100_b_p2_r3_f25_b75_f_p2_r1 f0_b100_b_f25_b75_f 2 21.89174 Freshwater
## f100_b0_f_p2_r1_f25_b75_f_p2_r1 f100_b0_f_f25_b75_f 2 15.56770 Freshwater
## f0_b100_b_p2_r1_f25_b75_f_p2_r1 f0_b100_b_f25_b75_f 2 21.62564 Freshwater
## f0_b100_b_p2_r4_f25_b75_f_p2_r1 f0_b100_b_f25_b75_f 2 21.88233 Freshwater
## f100_b0_f_p2_r4_f25_b75_f_p2_r1 f100_b0_f_f25_b75_f 2 14.00560 Freshwater
## source_com mix_ratio mix_simple
## f0_b100_b_p2_r2_f25_b75_f_p2_r1 sea 25:75 25:75 vs. sea
## f0_b100_b_p2_r3_f25_b75_f_p2_r1 sea 25:75 25:75 vs. sea
## f100_b0_f_p2_r1_f25_b75_f_p2_r1 fresh 25:75 25:75 vs. fresh
## f0_b100_b_p2_r1_f25_b75_f_p2_r1 sea 25:75 25:75 vs. sea
## f0_b100_b_p2_r4_f25_b75_f_p2_r1 sea 25:75 25:75 vs. sea
## f100_b0_f_p2_r4_f25_b75_f_p2_r1 fresh 25:75 25:75 vs. fresh
dim(df_f) # number of rows should be 900 (96*9 + 36)
## [1] 900 7
df_f %>% filter(mix_ratio == "25:75" & source_com == "sea" & passage == "1")
## mix passage Freq watertype
## f25_b75_f_p1_r1_f0_b100_b_p1_r1 f25_b75_f_f0_b100_b 1 24.11564 Freshwater
## f25_b75_f_p1_r3_f0_b100_b_p1_r1 f25_b75_f_f0_b100_b 1 22.36943 Freshwater
## f25_b75_f_p1_r2_f0_b100_b_p1_r1 f25_b75_f_f0_b100_b 1 23.43745 Freshwater
## f25_b75_f_p1_r4_f0_b100_b_p1_r1 f25_b75_f_f0_b100_b 1 22.23588 Freshwater
## f25_b75_f_p1_r1_f0_b100_b_p1_r2 f25_b75_f_f0_b100_b 1 22.04776 Freshwater
## f25_b75_f_p1_r3_f0_b100_b_p1_r2 f25_b75_f_f0_b100_b 1 21.04894 Freshwater
## f25_b75_f_p1_r2_f0_b100_b_p1_r2 f25_b75_f_f0_b100_b 1 21.68552 Freshwater
## f25_b75_f_p1_r4_f0_b100_b_p1_r2 f25_b75_f_f0_b100_b 1 20.41457 Freshwater
## f25_b75_f_p1_r1_f0_b100_b_p1_r3 f25_b75_f_f0_b100_b 1 23.42012 Freshwater
## f25_b75_f_p1_r3_f0_b100_b_p1_r3 f25_b75_f_f0_b100_b 1 22.00670 Freshwater
## f25_b75_f_p1_r2_f0_b100_b_p1_r3 f25_b75_f_f0_b100_b 1 23.00056 Freshwater
## f25_b75_f_p1_r4_f0_b100_b_p1_r3 f25_b75_f_f0_b100_b 1 21.97839 Freshwater
## f25_b75_f_p1_r1_f0_b100_b_p1_r4 f25_b75_f_f0_b100_b 1 23.67377 Freshwater
## f25_b75_f_p1_r3_f0_b100_b_p1_r4 f25_b75_f_f0_b100_b 1 21.85926 Freshwater
## f25_b75_f_p1_r2_f0_b100_b_p1_r4 f25_b75_f_f0_b100_b 1 22.56887 Freshwater
## f25_b75_f_p1_r4_f0_b100_b_p1_r4 f25_b75_f_f0_b100_b 1 21.49220 Freshwater
## source_com mix_ratio mix_simple
## f25_b75_f_p1_r1_f0_b100_b_p1_r1 sea 25:75 25:75 vs. sea
## f25_b75_f_p1_r3_f0_b100_b_p1_r1 sea 25:75 25:75 vs. sea
## f25_b75_f_p1_r2_f0_b100_b_p1_r1 sea 25:75 25:75 vs. sea
## f25_b75_f_p1_r4_f0_b100_b_p1_r1 sea 25:75 25:75 vs. sea
## f25_b75_f_p1_r1_f0_b100_b_p1_r2 sea 25:75 25:75 vs. sea
## f25_b75_f_p1_r3_f0_b100_b_p1_r2 sea 25:75 25:75 vs. sea
## f25_b75_f_p1_r2_f0_b100_b_p1_r2 sea 25:75 25:75 vs. sea
## f25_b75_f_p1_r4_f0_b100_b_p1_r2 sea 25:75 25:75 vs. sea
## f25_b75_f_p1_r1_f0_b100_b_p1_r3 sea 25:75 25:75 vs. sea
## f25_b75_f_p1_r3_f0_b100_b_p1_r3 sea 25:75 25:75 vs. sea
## f25_b75_f_p1_r2_f0_b100_b_p1_r3 sea 25:75 25:75 vs. sea
## f25_b75_f_p1_r4_f0_b100_b_p1_r3 sea 25:75 25:75 vs. sea
## f25_b75_f_p1_r1_f0_b100_b_p1_r4 sea 25:75 25:75 vs. sea
## f25_b75_f_p1_r3_f0_b100_b_p1_r4 sea 25:75 25:75 vs. sea
## f25_b75_f_p1_r2_f0_b100_b_p1_r4 sea 25:75 25:75 vs. sea
## f25_b75_f_p1_r4_f0_b100_b_p1_r4 sea 25:75 25:75 vs. sea
# in seawater media
s_levels <- c("f0_b100_b", "f25_b75_b", "f50_b50_b", "f75_b25_b", "f100_b0_b", "f100_b0_f")
df_s <- df_source %>%
unite(mix_a, c(a, water_a), sep = "_") %>%
unite(mix_b, c(b, water_b), sep = "_") %>%
mutate(mix_a = factor(mix_a)) %>%
mutate(mix_b = factor(mix_b)) %>%
filter(mix_a %in% s_levels & mix_b %in% s_levels) %>%
unite(mix, c(mix_a, mix_b), sep = "_") %>%
mutate(mix = factor(mix)) %>%
filter(mix != "f100_b0_f_f100_b0_f") %>%
droplevels()
df_s$watertype <- "Seawater"
levels(df_s$mix)
## [1] "f0_b100_b_f0_b100_b" "f0_b100_b_f100_b0_b" "f0_b100_b_f100_b0_f"
## [4] "f0_b100_b_f25_b75_b" "f0_b100_b_f50_b50_b" "f0_b100_b_f75_b25_b"
## [7] "f100_b0_b_f0_b100_b" "f100_b0_b_f100_b0_f" "f100_b0_f_f0_b100_b"
## [10] "f100_b0_f_f100_b0_b" "f100_b0_f_f25_b75_b" "f100_b0_f_f50_b50_b"
## [13] "f100_b0_f_f75_b25_b" "f25_b75_b_f0_b100_b" "f25_b75_b_f100_b0_f"
## [16] "f50_b50_b_f0_b100_b" "f50_b50_b_f100_b0_f" "f75_b25_b_f0_b100_b"
## [19] "f75_b25_b_f100_b0_f"
# Add source_com and mix_ratio columns
df_s <- df_s %>%
mutate(
source_com = ifelse(grepl("f100_b0_f", mix), "fresh", "sea"),
mix_ratio = case_when(
grepl("f100_b0_b", mix) ~ "100:0",
grepl("f25_b75", mix) ~ "25:75",
grepl("f50_b50", mix) ~ "50:50",
grepl("f75_b25", mix) ~ "75:25",
TRUE ~ "0:100" # Default value
)
) %>%
mutate(mix_simple = paste(mix_ratio, "vs.", source_com))
b_factor_levels <- c("100:0 vs. fresh",
"75:25 vs. fresh",
"50:50 vs. fresh",
"25:75 vs. fresh",
"0:100 vs. fresh",
"100:0 vs. sea",
"75:25 vs. sea",
"50:50 vs. sea",
"25:75 vs. sea",
"0:100 vs. sea")
df_s <- df_s %>%
mutate(mix_simple = factor(mix_simple, levels = b_factor_levels))
# check results
dim(df_s) # number of rows should be 900 (96*9 + 36)
## [1] 900 7
df_s %>% filter(mix_ratio == "0:100" & source_com == "sea" & passage == "1")
## mix passage Freq watertype
## f0_b100_b_p1_r2_f0_b100_b_p1_r1 f0_b100_b_f0_b100_b 1 14.75785 Seawater
## f0_b100_b_p1_r3_f0_b100_b_p1_r1 f0_b100_b_f0_b100_b 1 12.62805 Seawater
## f0_b100_b_p1_r4_f0_b100_b_p1_r1 f0_b100_b_f0_b100_b 1 13.40322 Seawater
## f0_b100_b_p1_r3_f0_b100_b_p1_r2 f0_b100_b_f0_b100_b 1 12.89964 Seawater
## f0_b100_b_p1_r4_f0_b100_b_p1_r2 f0_b100_b_f0_b100_b 1 12.42332 Seawater
## f0_b100_b_p1_r4_f0_b100_b_p1_r3 f0_b100_b_f0_b100_b 1 13.14045 Seawater
## source_com mix_ratio mix_simple
## f0_b100_b_p1_r2_f0_b100_b_p1_r1 sea 0:100 0:100 vs. sea
## f0_b100_b_p1_r3_f0_b100_b_p1_r1 sea 0:100 0:100 vs. sea
## f0_b100_b_p1_r4_f0_b100_b_p1_r1 sea 0:100 0:100 vs. sea
## f0_b100_b_p1_r3_f0_b100_b_p1_r2 sea 0:100 0:100 vs. sea
## f0_b100_b_p1_r4_f0_b100_b_p1_r2 sea 0:100 0:100 vs. sea
## f0_b100_b_p1_r4_f0_b100_b_p1_r3 sea 0:100 0:100 vs. sea
# combine results from freshwater and seawater
df_fs <- rbind(df_f, df_s) %>%
select(mix, watertype, mix_simple, mix_ratio, source_com, passage, Freq)
str(df_fs)
## 'data.frame': 1800 obs. of 7 variables:
## $ mix : Factor w/ 36 levels "f0_b100_b_f0_b100_f",..: 3 3 11 3 3 11 11 11 11 11 ...
## $ watertype : chr "Freshwater" "Freshwater" "Freshwater" "Freshwater" ...
## $ mix_simple: Factor w/ 10 levels "100:0 vs. fresh",..: 9 9 4 9 9 4 4 4 4 4 ...
## $ mix_ratio : chr "25:75" "25:75" "25:75" "25:75" ...
## $ source_com: chr "sea" "sea" "fresh" "sea" ...
## $ passage : Factor w/ 6 levels "1","2","3","4",..: 2 2 2 2 2 2 2 2 3 3 ...
## $ Freq : num 21.1 21.9 15.6 21.6 21.9 ...
cat("I should have these much pairs", (9*16+6)*6*2)
## I should have these much pairs 1800
warning("Check if the row number is right or not?!")
## Warning: Check if the row number is right or not?!
###########################
# box plot
###########################
# compare two source communities
(p <- ggplot(df_fs, aes(passage, Freq, fill = mix_simple)) +
geom_boxplot(alpha = 0.8, position = position_dodge(0.8), outlier.size=-1, linewidth = 0.1) +
geom_point(size = 1, stroke = 0.1, position = position_jitterdodge(jitter.width = 0.2, dodge.width = 0.8), shape = 21, color = "black", alpha = 0.6) +
facet_grid(watertype~.) +
labs(x = "Passages", y = "Aitchison distance to the native/source communities", title = "") +
scale_fill_brewer(palette = "RdYlBu", guide=guide_legend(override.aes = list(shape=21)), name = "Mixing ratio vs. native com.\n(fresh:sea)", direction = -1) +
mytheme) #+
# theme(panel.background = element_rect(fill = "lavenderblush1"),
# strip.text = element_blank()))
# ggsave(paste0(folder_path, "distance_to_source_communities_box_plot_raw.pdf"), width = 7, height = 3.5, p)
df_fs1 <- df_fs %>%
mutate(source_com = factor(source_com, levels = c("fresh", "sea"), labels = c("freshwater community", "seawater community"))) %>%
mutate(mix_ratio = factor(mix_ratio, levels = c("100:0", "75:25", "50:50", "25:75", "0:100" )))
# compare two culturing water conditions
(p <- ggplot(df_fs1, aes(interaction(watertype, passage), Freq, fill = mix_ratio)) +
geom_boxplot(alpha = 0.8, position = position_dodge(0.8), outlier.size=-1, linewidth = 0.1) +
geom_point(size = 1, stroke = 0.1, position = position_jitterdodge(jitter.width = 0.2, dodge.width = 0.8), shape = 21, color = "black", alpha = 0.6) +
facet_grid(source_com~.) +
labs(x = "Culturing water x Passages", y = "Aitchison distance to the native communities", title = "") +
scale_fill_brewer(palette = "RdYlBu", guide=guide_legend(override.aes = list(shape=21)), name = "Mixing ratio vs. native com.\n(fresh:sea)", direction = -1) +
mytheme ) # + theme(axis.text.x = element_text(angle = 45, hjust = 1)))
# ggsave(paste0(folder_path, "distance_to_source_communities_box_plot_source_com_raw.pdf"), width = 9, height = 3.8, p)
###########################
# line plot for all passages
###########################
df_fs1 <- df_fs %>%
separate(mix_ratio, c("fresh_ratio", "sea_ratio"), ":", remove = FALSE) %>%
select(-c(mix, mix_simple))
df_fs1 <- df_fs1 %>%
mutate(source_ratio = if_else(source_com == "fresh", fresh_ratio, sea_ratio))
df_fs1$source_ratio <- as.numeric(as.character(df_fs1$source_ratio))
df_fs1$fresh_ratio <- as.numeric(as.character(df_fs1$fresh_ratio))
head(df_fs1)
## watertype mix_ratio fresh_ratio sea_ratio
## f0_b100_b_p2_r2_f25_b75_f_p2_r1 Freshwater 25:75 25 75
## f0_b100_b_p2_r3_f25_b75_f_p2_r1 Freshwater 25:75 25 75
## f100_b0_f_p2_r1_f25_b75_f_p2_r1 Freshwater 25:75 25 75
## f0_b100_b_p2_r1_f25_b75_f_p2_r1 Freshwater 25:75 25 75
## f0_b100_b_p2_r4_f25_b75_f_p2_r1 Freshwater 25:75 25 75
## f100_b0_f_p2_r4_f25_b75_f_p2_r1 Freshwater 25:75 25 75
## source_com passage Freq source_ratio
## f0_b100_b_p2_r2_f25_b75_f_p2_r1 sea 2 21.07002 75
## f0_b100_b_p2_r3_f25_b75_f_p2_r1 sea 2 21.89174 75
## f100_b0_f_p2_r1_f25_b75_f_p2_r1 fresh 2 15.56770 25
## f0_b100_b_p2_r1_f25_b75_f_p2_r1 sea 2 21.62564 75
## f0_b100_b_p2_r4_f25_b75_f_p2_r1 sea 2 21.88233 75
## f100_b0_f_p2_r4_f25_b75_f_p2_r1 fresh 2 14.00560 25
# Step 1: Compute p-values for each group
lm_results <- df_fs1 %>%
group_by(watertype, source_com) %>%
do({
model <- lm(Freq ~ source_ratio, data = .)
tidy(model) # Extract regression statistics
}) %>%
filter(term == "source_ratio") %>% # Only keep slope term (not intercept)
mutate(significant = p.value < 0.05) # Mark significant regressions
# Step 2: Merge with original data to keep significant groups only
df_filtered <- df_fs1 %>%
inner_join(lm_results, by = c("watertype", "source_com")) %>%
mutate(source_com = factor(source_com, levels = c("fresh", "sea"), labels = c("Freshwater\nSource Community", "Seawater\nSource Community")))
df_fs1 <- df_fs1 %>%
mutate(source_com = factor(source_com, levels = c("fresh", "sea"), labels = c("Freshwater\nSource Community", "Seawater\nSource Community")))
head(df_filtered)
## watertype mix_ratio fresh_ratio sea_ratio source_com
## 1 Freshwater 25:75 25 75 Seawater\nSource Community
## 2 Freshwater 25:75 25 75 Seawater\nSource Community
## 3 Freshwater 25:75 25 75 Freshwater\nSource Community
## 4 Freshwater 25:75 25 75 Seawater\nSource Community
## 5 Freshwater 25:75 25 75 Seawater\nSource Community
## 6 Freshwater 25:75 25 75 Freshwater\nSource Community
## passage Freq source_ratio term estimate std.error
## 1 2 21.07002 75 source_ratio -0.004836835 0.002329795
## 2 2 21.89174 75 source_ratio -0.004836835 0.002329795
## 3 2 15.56770 25 source_ratio -0.044714568 0.003839986
## 4 2 21.62564 75 source_ratio -0.004836835 0.002329795
## 5 2 21.88233 75 source_ratio -0.004836835 0.002329795
## 6 2 14.00560 25 source_ratio -0.044714568 0.003839986
## statistic p.value significant
## 1 -2.076078 3.842067e-02 TRUE
## 2 -2.076078 3.842067e-02 TRUE
## 3 -11.644462 2.468113e-27 TRUE
## 4 -2.076078 3.842067e-02 TRUE
## 5 -2.076078 3.842067e-02 TRUE
## 6 -11.644462 2.468113e-27 TRUE
str(df_fs1)
## 'data.frame': 1800 obs. of 8 variables:
## $ watertype : chr "Freshwater" "Freshwater" "Freshwater" "Freshwater" ...
## $ mix_ratio : chr "25:75" "25:75" "25:75" "25:75" ...
## $ fresh_ratio : num 25 25 25 25 25 25 25 25 25 25 ...
## $ sea_ratio : chr "75" "75" "75" "75" ...
## $ source_com : Factor w/ 2 levels "Freshwater\nSource Community",..: 2 2 1 2 2 1 1 1 1 1 ...
## $ passage : Factor w/ 6 levels "1","2","3","4",..: 2 2 2 2 2 2 2 2 3 3 ...
## $ Freq : num 21.1 21.9 15.6 21.6 21.9 ...
## $ source_ratio: num 75 75 25 75 75 25 25 25 25 25 ...
# Step 3: line plot
# line plot - combine all passages
(p <- ggplot(df_fs1, aes(fresh_ratio, Freq, fill = source_com, color = source_com)) +
geom_point(size = 1.5, alpha = 0.05, position = position_jitter(width = 4)) +
scale_x_continuous( breaks = c(0, 25, 50, 75, 100), labels = c("0:100", "25:75", "50:50", "75:25", "100:0")) +
facet_grid(. ~ watertype ) +
geom_smooth(data = df_fs1, method = "lm", aes(color = source_com), size = 0.5, alpha = 0.3) +
# geom_smooth(method = "lm", aes(color = source_com), alpha = 0.6) + # Match regression line colors
# Plot regression lines only for significant groups
# Add regression equation, R², and p-value
stat_poly_eq(
aes(label = after_stat(paste(..eq.label.., ..rr.label.., ..p.value.label.., sep = "~~~"))),
formula = y ~ x,
parse = TRUE,
size = 2.5) +
labs(x = "Mixing ratio\n(fresh:sea)", y = "Aitchison distance", title = "") +
scale_fill_manual(values = c("#4575B4", "#D73027"), guide=guide_legend(override.aes = list(shape=21)), name = "Distance to") +
scale_color_manual(values = c("#4575B4", "#D73027"), name = "Distance to") +
mytheme +
theme(legend.position.inside = c(0.95, 0.95)))
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## `geom_smooth()` using formula = 'y ~ x'
# ggsave(paste0(folder_path, "distance_to_source_communities_line_plot_clr_all_raw.pdf"), width = 6, height = 3, p)
####################
# line plot in facet
####################
# Step 1: Compute p-values for each group
lm_results <- df_fs1 %>%
group_by(watertype, passage, source_com) %>%
do({
model <- lm(Freq ~ source_ratio, data = .)
tidy(model) # Extract regression statistics
}) %>%
filter(term == "source_ratio") %>% # Only keep slope term (not intercept)
mutate(significant = p.value < 0.05) # Mark significant regressions
# Step 2: Merge with original data to keep significant groups only
df_filtered <- df_fs1 %>%
inner_join(lm_results %>% filter(significant), by = c("watertype", "passage", "source_com"))
# Step 3: line plot in facet
(p <- ggplot(df_fs1, aes(source_ratio, Freq, fill = source_com, color = source_com)) +
geom_point(size = 2, alpha = 0.2) +
# geom_smooth(method = "lm", aes(color = source_com), alpha = 0.6) + # Match regression line colors
# Plot regression lines only for significant groups
geom_smooth(data = df_filtered, method = "lm", aes(color = source_com), alpha = 0.7) +
# Add regression equation, R², and p-value
stat_poly_eq(
aes(label = after_stat(
paste(..eq.label.., ..rr.label.., ..p.value.label.., sep = "~~~")
)),
formula = y ~ x,
parse = TRUE,
size = 1.8) +
facet_grid(watertype~passage) +
scale_x_continuous(
breaks = c(0, 25, 50, 75, 100), # Define break points
labels = c("0:100", "25:75", "50:50", "75:25", "100:0")) +
labs(x = "", y = "Aitchison distance to the native/source communities", title = "") +
scale_fill_manual(values = c("#4575B4", "#D73027"), guide=guide_legend(override.aes = list(shape=21)), name = "Native/source community") +
scale_color_manual(values = c("#4575B4", "#D73027"), name = "Native/source community") +
mytheme +
theme(legend.position = "bottom"))
## `geom_smooth()` using formula = 'y ~ x'
## Warning in ci_f_ncp(stat, df1 = df1, df2 = df2, probs = probs): Upper limit
## outside search range. Set to the maximum of the parameter range.
## Warning in compute_group(...): CI computation error: Error in check_output(cint, probs = probs, parameter_range = c(0, 1)): out[1] <= out[2] is not TRUE
# ggsave(paste0(folder_path, "distance_to_source_communities_line_plot_raw.png"), width = 10, height = 4, p)
###########################
# dis to source on two axis
###########################
# reorganize the data.frame
df_dis <- df_fs %>%
select(-c(mix, mix_simple)) %>%
group_by(watertype, mix_ratio, source_com, passage) %>%
mutate(seq_number = row_number()) %>%
pivot_wider(names_from = c(source_com), values_from = Freq) %>%
mutate(mix_ratio = factor(mix_ratio, levels = c("0:100", "25:75", "50:50", "75:25", "100:0"))) %>%
drop_na()
# check
summary(df_dis)
## watertype mix_ratio passage seq_number sea
## Length:840 0:100:132 1:140 Min. : 1.000 Min. : 7.933
## Class :character 25:75:192 2:140 1st Qu.: 4.000 1st Qu.:15.492
## Mode :character 50:50:192 3:140 Median : 8.000 Median :18.835
## 75:25:192 4:140 Mean : 8.071 Mean :18.104
## 100:0:132 5:140 3rd Qu.:12.000 3rd Qu.:20.751
## 6:140 Max. :16.000 Max. :25.858
## fresh
## Min. : 9.295
## 1st Qu.:15.203
## Median :18.837
## Mean :18.274
## 3rd Qu.:20.951
## Max. :28.249
df_dis %>% filter(watertype == "Freshwater" & mix_ratio == "100:0" & passage == "1") %>% print()
## # A tibble: 6 × 6
## # Groups: watertype, mix_ratio, passage [1]
## watertype mix_ratio passage seq_number sea fresh
## <chr> <fct> <fct> <int> <dbl> <dbl>
## 1 Freshwater 100:0 1 1 23.7 14.8
## 2 Freshwater 100:0 1 2 21.5 15.4
## 3 Freshwater 100:0 1 3 23.2 15.2
## 4 Freshwater 100:0 1 4 22.9 9.95
## 5 Freshwater 100:0 1 5 22.1 11.2
## 6 Freshwater 100:0 1 6 22.4 10.6
# write.csv(df_dis, paste0(folder_path, "distance_to_source_com_check.csv"))
# Compute regression statistics for each combination of watertype and passage
regression_results <- df_dis %>%
group_by(watertype, passage) %>%
summarize(
model = list(lm(sea ~ fresh, data = cur_data())),
p_value = summary(model[[1]])$coefficients[2, 4] # Extract p-value for fresh
) %>%
filter(p_value < 0.05) # Keep only significant regressions
## Warning: There was 1 warning in `summarize()`.
## ℹ In argument: `model = list(lm(sea ~ fresh, data = cur_data()))`.
## ℹ In group 1: `watertype = "Freshwater"` and `passage = 1`.
## Caused by warning:
## ! `cur_data()` was deprecated in dplyr 1.1.0.
## ℹ Please use `pick()` instead.
## `summarise()` has grouped output by 'watertype'. You can override using the
## `.groups` argument.
# Merge with original data to keep only groups with significant regressions
df_filtered <- df_dis %>% inner_join(regression_results, by = c("watertype", "passage"))
# plot distance to two source communities
(p <- ggplot(df_dis, aes(x = fresh, y = sea)) +
geom_abline(slope = 1, intercept = 0, linetype = "dashed", color = "darkred", alpha = 0.5, size = 0.8) + # y = x line
geom_point(aes(fill = mix_ratio, shape = watertype), size = 2, stroke = 0.1, color = "black", shape = 21, alpha = 0.6) +
# geom_smooth(data = df_filtered, aes(x = fresh, y = sea), method = "lm", formula = y ~ x, se = FALSE) + # Regression line only for significant ones
# stat_regline_equation(data = df_filtered, aes(label = paste(..eq.label.., sep = "~~~")), size = 3) + # Regression equation
# stat_cor(data = df_filtered, aes(label = paste("p =", p_value)), size = 3) + # P-value
facet_grid(watertype ~ passage) +
labs(x = "Aitchison to freshwater source communities", y = "Aitchison to seawater source communities", title = "") +
scale_fill_brewer(palette = "RdYlBu", guide=guide_legend(override.aes = list(shape=21)), name = "Mixing ratio \n(fresh:sea)") +
xlim(8.1, 24) + # Set x-axis limits
ylim(8.1, 24) + # Set y
mytheme)
## Warning: Removed 38 rows containing missing values or values outside the scale range
## (`geom_point()`).
# ggsave(paste0(folder_path, "distance_to_source_coms.png"), plot = p, width = 9, height = 3.3)
# Summarize mean and standard error for fresh and sea
df_summary <- df_fs %>%
select(-c(mix, mix_simple)) %>%
group_by(watertype, mix_ratio, passage, source_com) %>% # Group by relevant factors
summarise(
mean = mean(Freq, na.rm = TRUE),
se = sd(Freq, na.rm = TRUE) / sqrt(n())) %>%
ungroup() %>%
arrange(passage, mix_ratio) %>%
pivot_wider(names_from = source_com, values_from = c(mean, se), names_glue = "{.value}_{source_com}") %>%
mutate(mix_ratio = factor(mix_ratio, levels = c("0:100", "25:75", "50:50", "75:25", "100:0")))
## `summarise()` has grouped output by 'watertype', 'mix_ratio', 'passage'. You
## can override using the `.groups` argument.
summary(df_summary)
## watertype mix_ratio passage mean_fresh mean_sea
## Length:60 0:100:12 1:10 Min. :12.37 Min. :10.06
## Class :character 25:75:12 2:10 1st Qu.:14.69 1st Qu.:15.19
## Mode :character 50:50:12 3:10 Median :18.95 Median :18.65
## 75:25:12 4:10 Mean :18.20 Mean :17.91
## 100:0:12 5:10 3rd Qu.:20.73 3rd Qu.:20.47
## 6:10 Max. :23.27 Max. :22.50
## se_fresh se_sea
## Min. :0.1906 Min. :0.1726
## 1st Qu.:0.3079 1st Qu.:0.3049
## Median :0.3765 Median :0.3962
## Mean :0.4394 Mean :0.4323
## 3rd Qu.:0.5132 3rd Qu.:0.5120
## Max. :1.2787 Max. :1.0076
# y = -x+a
a <- max(df_summary$mean_fresh) + min(df_summary$mean_sea)
min_xylim <- min(min(df_summary$mean_sea), min(df_summary$mean_fresh))
max_xylim <- max(max(df_summary$mean_sea), max(df_summary$mean_fresh))
# extract the color codes from the "RdYlBu" palette using the RColorBrewer
# display.brewer.pal(5, "RdYlBu")
# (colors <- brewer.pal(5, "RdYlBu"))
colors <- c("#D7191C", "#FDAE61", "gold2", "#ABD9E9", "#2C7BB6")
# Plot mean and standard error with connections
(p <- ggplot(df_summary, aes(x = mean_fresh, y = mean_sea, shape = passage)) +
geom_abline(slope = 1, intercept = 0, linetype = "dashed", color = "darkred", alpha = 0.2, size = 0.4) + # y = x line
# geom_abline(slope = -1, intercept = a, linetype = "dashed", color = "darkblue", alpha = 0.2, size = 0.4) + # y = x line
geom_errorbar(aes(ymin = mean_sea - se_sea, ymax = mean_sea + se_sea, color = mix_ratio), linewidth = 0.2, width = 0.1, alpha = 0.1, position = position_dodge(0.3)) + # SE as error bars
geom_errorbarh(aes(xmin = mean_fresh - se_fresh, xmax = mean_fresh + se_fresh, color = mix_ratio), linewidth = 0.2, height = 0.1, alpha = 0.1, position = position_dodge(0.3)) + # Horizontal SE error bars
geom_path(aes(group = mix_ratio, color = mix_ratio), linewidth = 0.2, alpha= 0.2) +
# geom_point(aes(fill = mix_ratio, shape = watertype), size = 1, stroke = 0.1, alpha= 0.7, color = "black", shape = 21) +
geom_text(aes(label = passage, color = mix_ratio), size = 3) +
# facet_grid(watertype ~ mix_ratio) +
facet_grid(. ~ watertype) +
labs(x = "Aitchison to native freshwater communities", y = "Aitchison to native seawater communities", title = "") +
scale_color_manual(values = colors, name = "Mixing ratio \n(fresh:sea)") +
xlim(min_xylim, max_xylim) +
ylim(min_xylim, max_xylim) +
mytheme)
## Warning: `position_dodge()` requires non-overlapping x intervals.
## Warning: `position_dodge()` requires non-overlapping x intervals.
## `position_dodge()` requires non-overlapping x intervals.
## `position_dodge()` requires non-overlapping x intervals.
## Warning: Removed 2 rows containing missing values or values outside the scale range
## (`geom_errorbarh()`).
# ggsave(paste0(folder_path, "distance_to_source_coms_mean_se_with_lines_facet1.png"), plot = p, width = 5.5, height = 2.8)
# ggsave(paste0(folder_path, "distance_to_source_coms_mean_se_with_lines_nr_raw2.pdf"), plot = p, width = 5.5, height = 2.8)
sessionInfo()
## R version 4.4.2 (2024-10-31)
## Platform: aarch64-apple-darwin20
## Running under: macOS Monterey 12.3
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/4.4-arm64/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.4-arm64/Resources/lib/libRlapack.dylib; LAPACK version 3.12.0
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## time zone: Europe/Berlin
## tzcode source: internal
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] RColorBrewer_1.1-3 broom_1.0.7 ggpmisc_0.6.1 ggpp_0.5.8-1
## [5] circlize_0.4.16 phyloseq_1.50.0 ggpubr_0.6.0 vegan_2.6-8
## [9] lattice_0.22-6 permute_0.9-7 lubridate_1.9.4 forcats_1.0.0
## [13] stringr_1.5.1 dplyr_1.1.4 purrr_1.0.2 readr_2.1.5
## [17] tidyr_1.3.1 tibble_3.2.1 ggplot2_3.5.2 tidyverse_2.0.0
##
## loaded via a namespace (and not attached):
## [1] polynom_1.4-1 rlang_1.1.4 magrittr_2.0.3
## [4] ade4_1.7-22 compiler_4.4.2 mgcv_1.9-1
## [7] vctrs_0.6.5 reshape2_1.4.4 quantreg_5.99.1
## [10] pkgconfig_2.0.3 shape_1.4.6.1 crayon_1.5.3
## [13] fastmap_1.2.0 backports_1.5.0 XVector_0.46.0
## [16] labeling_0.4.3 utf8_1.2.4 rmarkdown_2.29
## [19] tzdb_0.4.0 UCSC.utils_1.2.0 MatrixModels_0.5-3
## [22] xfun_0.52 zlibbioc_1.52.0 cachem_1.1.0
## [25] GenomeInfoDb_1.42.1 jsonlite_1.8.9 biomformat_1.34.0
## [28] rhdf5filters_1.18.0 Rhdf5lib_1.28.0 parallel_4.4.2
## [31] cluster_2.1.7 R6_2.5.1 bslib_0.8.0
## [34] stringi_1.8.4 car_3.1-3 jquerylib_0.1.4
## [37] Rcpp_1.0.13-1 iterators_1.0.14 knitr_1.49
## [40] IRanges_2.40.1 Matrix_1.7-1 splines_4.4.2
## [43] igraph_2.1.2 timechange_0.3.0 tidyselect_1.2.1
## [46] rstudioapi_0.17.1 abind_1.4-8 yaml_2.3.10
## [49] codetools_0.2-20 plyr_1.8.9 Biobase_2.66.0
## [52] withr_3.0.2 evaluate_1.0.1 survival_3.7-0
## [55] Biostrings_2.74.0 confintr_1.0.2 pillar_1.9.0
## [58] carData_3.0-5 foreach_1.5.2 stats4_4.4.2
## [61] generics_0.1.3 S4Vectors_0.44.0 hms_1.1.3
## [64] munsell_0.5.1 scales_1.3.0 glue_1.8.0
## [67] tools_4.4.2 data.table_1.16.4 SparseM_1.84-2
## [70] ggsignif_0.6.4 rhdf5_2.50.0 grid_4.4.2
## [73] ape_5.8 colorspace_2.1-1 nlme_3.1-166
## [76] GenomeInfoDbData_1.2.13 Formula_1.2-5 cli_3.6.3
## [79] fansi_1.0.6 gtable_0.3.6 rstatix_0.7.2
## [82] sass_0.4.9 digest_0.6.37 BiocGenerics_0.52.0
## [85] farver_2.1.2 htmltools_0.5.8.1 multtest_2.62.0
## [88] lifecycle_1.0.4 httr_1.4.7 GlobalOptions_0.1.2
## [91] MASS_7.3-61